*********************EJERCICIO 2********************
***********************MODELOS**********************
*********Exploración y Limpieza de datos************
#Librerias
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.3 ✓ purrr 0.3.4
## ✓ tibble 3.1.2 ✓ dplyr 1.0.6
## ✓ tidyr 1.1.3 ✓ stringr 1.4.0
## ✓ readr 1.4.0 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(h2o)
##
## ----------------------------------------------------------------------
##
## Your next step is to start H2O:
## > h2o.init()
##
## For H2O package documentation, ask for help:
## > ??h2o
##
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit https://docs.h2o.ai
##
## ----------------------------------------------------------------------
##
## Attaching package: 'h2o'
## The following objects are masked from 'package:stats':
##
## cor, sd, var
## The following objects are masked from 'package:base':
##
## &&, %*%, %in%, ||, apply, as.factor, as.numeric, colnames,
## colnames<-, ifelse, is.character, is.factor, is.numeric, log,
## log10, log1p, log2, round, signif, trunc
library(ggpubr)
library(modeldata)
library(caTools)
library(tidymodels)
## Registered S3 method overwritten by 'tune':
## method from
## required_pkgs.model_spec parsnip
## ── Attaching packages ────────────────────────────────────── tidymodels 0.1.3 ──
## ✓ broom 0.7.6 ✓ rsample 0.1.0
## ✓ dials 0.0.9 ✓ tune 0.1.5
## ✓ infer 0.5.4 ✓ workflows 0.2.2
## ✓ parsnip 0.1.6 ✓ workflowsets 0.0.2
## ✓ recipes 0.1.16 ✓ yardstick 0.0.8
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## x scales::discard() masks purrr::discard()
## x dplyr::filter() masks stats::filter()
## x recipes::fixed() masks stringr::fixed()
## x dplyr::lag() masks stats::lag()
## x yardstick::spec() masks readr::spec()
## x recipes::step() masks stats::step()
## • Use tidymodels_prefer() to resolve common conflicts.
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
library(doParallel)
## Loading required package: foreach
##
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
##
## accumulate, when
## Loading required package: iterators
## Loading required package: parallel
library(data.tree)
library(DiagrammeR)
library(imputeTS)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
#Inicializacion y exploracion de datos
turbines <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-10-27/wind-turbine.csv")
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## objectid = col_double(),
## province_territory = col_character(),
## project_name = col_character(),
## total_project_capacity_mw = col_double(),
## turbine_identifier = col_character(),
## turbine_number_in_project = col_character(),
## turbine_rated_capacity_k_w = col_double(),
## rotor_diameter_m = col_double(),
## hub_height_m = col_double(),
## manufacturer = col_character(),
## model = col_character(),
## commissioning_date = col_character(),
## latitude = col_double(),
## longitude = col_double(),
## notes = col_character()
## )
colnames(turbines)
## [1] "objectid" "province_territory"
## [3] "project_name" "total_project_capacity_mw"
## [5] "turbine_identifier" "turbine_number_in_project"
## [7] "turbine_rated_capacity_k_w" "rotor_diameter_m"
## [9] "hub_height_m" "manufacturer"
## [11] "model" "commissioning_date"
## [13] "latitude" "longitude"
## [15] "notes"
summary(turbines)
## objectid province_territory project_name total_project_capacity_mw
## Min. : 1 Length:6698 Length:6698 Min. : 0.3
## 1st Qu.:1675 Class :character Class :character 1st Qu.: 70.5
## Median :3350 Mode :character Mode :character Median :102.0
## Mean :3350 Mean :134.8
## 3rd Qu.:5024 3rd Qu.:181.5
## Max. :6698 Max. :350.0
##
## turbine_identifier turbine_number_in_project turbine_rated_capacity_k_w
## Length:6698 Length:6698 Min. : 65
## Class :character Class :character 1st Qu.:1600
## Mode :character Mode :character Median :1880
## Mean :1967
## 3rd Qu.:2300
## Max. :3750
## NA's :220
## rotor_diameter_m hub_height_m manufacturer model
## Min. : 15.00 Min. : 24.50 Length:6698 Length:6698
## 1st Qu.: 80.00 1st Qu.: 80.00 Class :character Class :character
## Median : 90.00 Median : 80.00 Mode :character Mode :character
## Mean : 88.62 Mean : 83.34
## 3rd Qu.:100.00 3rd Qu.: 92.00
## Max. :141.00 Max. :132.00
##
## commissioning_date latitude longitude notes
## Length:6698 Min. :42.00 Min. :-135.23 Length:6698
## Class :character 1st Qu.:43.98 1st Qu.: -84.41 Class :character
## Mode :character Median :46.67 Median : -80.67 Mode :character
## Mean :46.76 Mean : -83.03
## 3rd Qu.:49.17 3rd Qu.: -67.85
## Max. :64.49 Max. : -52.97
##
distinct(turbines, manufacturer) #para validar si se necesita limpiar
## # A tibble: 23 x 1
## manufacturer
## <chr>
## 1 Bonus
## 2 Vestas
## 3 Nordex
## 4 Enercon
## 5 GE
## 6 Lagerwey
## 7 Gamesa
## 8 Siemens
## 9 Leitwind
## 10 Senvion
## # … with 13 more rows
distinct(turbines, province_territory) #para validar si se necesita limpiar
## # A tibble: 12 x 1
## province_territory
## <chr>
## 1 Alberta
## 2 British Columbia
## 3 Manitoba
## 4 New Brunswick
## 5 Newfoundland and Labrador
## 6 Northwest Territories
## 7 Nova Scotia
## 8 Ontario
## 9 Prince Edward Island
## 10 Quebec
## 11 Saskatchewan
## 12 Yukon
#Limpieza de la columna manufacturer
# Normalize manufacturer names so each vendor maps to a single label
# (e.g. "Acciona Wind Power" collapses onto "Acciona").
turbines <- turbines %>%
  mutate(
    manufacturer = str_replace(manufacturer, "Acciona Wind Power", "Acciona"),
    manufacturer = str_replace(manufacturer, "Samsung Renewable Energy", "Samsung")
  )
Viendo las columnas del dataset, mi hipótesis es que las variables más relevantes para pronosticar la capacidad de la turbina son rotor_diameter_m, hub_height_m, manufacturer y model. Tal vez incluso commissioning_date: supongo que entre más nuevas son las turbinas, más capacidad tienen. Creo que vale la pena explorar esta columna:
min(turbines$commissioning_date)
## [1] "1993"
max(turbines$commissioning_date)
## [1] "2019"
distinct(turbines, commissioning_date)
## # A tibble: 35 x 1
## commissioning_date
## <chr>
## 1 1993
## 2 1997
## 3 1998
## 4 2000
## 5 2001
## 6 2002
## 7 2003
## 8 2004
## 9 2006
## 10 2007
## # … with 25 more rows
class(turbines$commissioning_date)
## [1] "character"
# commissioning_date stores only a year as text; coerce it to numeric so it
# can be used as a regression predictor. Values that are not plain years
# become NA here (hence the coercion warning below) — presumably these are
# multi-year strings such as "2014/2015"; verify against the raw data.
turbines <- turbines %>%
mutate(commissioning_date = as.numeric(commissioning_date))
## Warning in base::as.numeric(x): NAs introduced by coercion
summary(turbines)
## objectid province_territory project_name total_project_capacity_mw
## Min. : 1 Length:6698 Length:6698 Min. : 0.3
## 1st Qu.:1675 Class :character Class :character 1st Qu.: 70.5
## Median :3350 Mode :character Mode :character Median :102.0
## Mean :3350 Mean :134.8
## 3rd Qu.:5024 3rd Qu.:181.5
## Max. :6698 Max. :350.0
##
## turbine_identifier turbine_number_in_project turbine_rated_capacity_k_w
## Length:6698 Length:6698 Min. : 65
## Class :character Class :character 1st Qu.:1600
## Mode :character Mode :character Median :1880
## Mean :1967
## 3rd Qu.:2300
## Max. :3750
## NA's :220
## rotor_diameter_m hub_height_m manufacturer model
## Min. : 15.00 Min. : 24.50 Length:6698 Length:6698
## 1st Qu.: 80.00 1st Qu.: 80.00 Class :character Class :character
## Median : 90.00 Median : 80.00 Mode :character Mode :character
## Mean : 88.62 Mean : 83.34
## 3rd Qu.:100.00 3rd Qu.: 92.00
## Max. :141.00 Max. :132.00
##
## commissioning_date latitude longitude notes
## Min. :1993 Min. :42.00 Min. :-135.23 Length:6698
## 1st Qu.:2009 1st Qu.:43.98 1st Qu.: -84.41 Class :character
## Median :2012 Median :46.67 Median : -80.67 Mode :character
## Mean :2011 Mean :46.76 Mean : -83.03
## 3rd Qu.:2014 3rd Qu.:49.17 3rd Qu.: -67.85
## Max. :2019 Max. :64.49 Max. : -52.97
## NA's :868
# Impute the remaining NA years with the column mean (imputeTS::na_mean).
# NOTE(review): mean-imputing before the train/test split leaks summary
# statistics across partitions; ideally imputation would happen inside the
# resampling/preprocessing pipeline instead.
turbines <- turbines %>%
mutate(commissioning_date = na_mean(commissioning_date))
summary(turbines)
## objectid province_territory project_name total_project_capacity_mw
## Min. : 1 Length:6698 Length:6698 Min. : 0.3
## 1st Qu.:1675 Class :character Class :character 1st Qu.: 70.5
## Median :3350 Mode :character Mode :character Median :102.0
## Mean :3350 Mean :134.8
## 3rd Qu.:5024 3rd Qu.:181.5
## Max. :6698 Max. :350.0
##
## turbine_identifier turbine_number_in_project turbine_rated_capacity_k_w
## Length:6698 Length:6698 Min. : 65
## Class :character Class :character 1st Qu.:1600
## Mode :character Mode :character Median :1880
## Mean :1967
## 3rd Qu.:2300
## Max. :3750
## NA's :220
## rotor_diameter_m hub_height_m manufacturer model
## Min. : 15.00 Min. : 24.50 Length:6698 Length:6698
## 1st Qu.: 80.00 1st Qu.: 80.00 Class :character Class :character
## Median : 90.00 Median : 80.00 Mode :character Mode :character
## Mean : 88.62 Mean : 83.34
## 3rd Qu.:100.00 3rd Qu.: 92.00
## Max. :141.00 Max. :132.00
##
## commissioning_date latitude longitude notes
## Min. :1993 Min. :42.00 Min. :-135.23 Length:6698
## 1st Qu.:2009 1st Qu.:43.98 1st Qu.: -84.41 Class :character
## Median :2011 Median :46.67 Median : -80.67 Mode :character
## Mean :2011 Mean :46.76 Mean : -83.03
## 3rd Qu.:2014 3rd Qu.:49.17 3rd Qu.: -67.85
## Max. :2019 Max. :64.49 Max. : -52.97
##
Como sólo contiene el año, convertí la variable commissioning_date a numérica para poder utilizarla en la regresión, sin embargo al hacerlo obtuve más de 800 observaciones con NA, por lo que utilicé una función para convertir esas observaciones en la media de la columna. Las columnas rotor_diameter_m y hub_height_m no tienen valores omitidos (NA) de acuerdo al summary del dataset, por lo que no necesitan limpieza, solo me faltaría revisar cómo aparece la columna model y creo que puedo empezar a trabajar con el dataset sin mayor preprocesado manual de mi parte.
distinct(turbines, model)
## # A tibble: 92 x 1
## model
## <chr>
## 1 AN 150/30
## 2 V44/600
## 3 V47/660
## 4 N60/1300
## 5 E-40/600
## 6 V80/1800
## 7 V90/3000
## 8 GE 1.5SLE
## 9 LW750-52
## 10 GE 1.6-100
## # … with 82 more rows
distinct(turbines, manufacturer,model)
## # A tibble: 92 x 2
## manufacturer model
## <chr> <chr>
## 1 Bonus AN 150/30
## 2 Vestas V44/600
## 3 Vestas V47/660
## 4 Nordex N60/1300
## 5 Enercon E-40/600
## 6 Vestas V80/1800
## 7 Vestas V90/3000
## 8 GE GE 1.5SLE
## 9 Lagerwey LW750-52
## 10 GE GE 1.6-100
## # … with 82 more rows
# Treat the two categorical identifiers as factors so downstream modeling
# functions see discrete levels rather than free-form strings.
turbines <- turbines %>%
  mutate(across(c(manufacturer, model), as.factor))
class(turbines$manufacturer)
## [1] "factor"
class(turbines$model)
## [1] "factor"
Quisiera hacer un scatter plot de las dos variables que considero más relevantes, haciendo distinción del manufacturero por color:
turbines %>% ggplot(aes(x = rotor_diameter_m, y = hub_height_m,color = manufacturer)) + geom_point()
Pues creo que estoy listo para empezar a trabajar con el dataset, sólo quisiera seleccionar las variables que creo relevantes y omitir las demás que creo no aportan al análisis:
turbines <- turbines %>% select(capacity = turbine_rated_capacity_k_w,rotor_diameter_m,hub_height_m,manufacturer,commissioning_date)
colnames(turbines)
## [1] "capacity" "rotor_diameter_m" "hub_height_m"
## [4] "manufacturer" "commissioning_date"
summary(turbines)
## capacity rotor_diameter_m hub_height_m manufacturer
## Min. : 65 Min. : 15.00 Min. : 24.50 Vestas :1834
## 1st Qu.:1600 1st Qu.: 80.00 1st Qu.: 80.00 GE :1725
## Median :1880 Median : 90.00 Median : 80.00 Siemens :1248
## Mean :1967 Mean : 88.62 Mean : 83.34 Enercon : 960
## 3rd Qu.:2300 3rd Qu.:100.00 3rd Qu.: 92.00 Senvion : 643
## Max. :3750 Max. :141.00 Max. :132.00 NEG Micon: 132
## NA's :220 (Other) : 156
## commissioning_date
## Min. :1993
## 1st Qu.:2009
## Median :2011
## Mean :2011
## 3rd Qu.:2014
## Max. :2019
##
nrow(turbines)
## [1] 6698
Apenas me estoy dando cuenta que hay 220 registros con NA en la variable que queremos predecir, por lo que creo que para efectos de modelar correctamente debemos eliminar esos registros dado que no representan un gran porcentaje de los datos (tenemos casi 6700 observaciones):
turbines <- turbines[complete.cases(turbines), ]
nrow(turbines)
## [1] 6478
Finalmente me gustaría confirmar gráficamente que las dos variables numéricas que considero más relevantes tienen una buena correlación con la variable a pronosticar:
turbines %>% ggplot(aes(x = rotor_diameter_m, y = capacity)) + geom_point(color="blue")
turbines %>% ggplot(aes(x = hub_height_m, y = capacity))+ geom_point(color="red")
Creo que se ve una clara relación lineal entre las variables por lo que creo que podemos obtener buenas predicciones. Comencemos con el primer modelo: *********************EJERCICIO 2********************
***********************MODELOS**********************
******************H20 random forest*****************
# Start (or connect to) a local H2O cluster on the default port 54321.
h2o.init(
ip = "localhost",
# -1 means use every available core.
nthreads = -1,
# Maximum memory allotted to the cluster.
max_mem_size = "6g"
)
## Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 1 hours 26 minutes
## H2O cluster timezone: UTC
## H2O data parsing timezone: UTC
## H2O cluster version: 3.32.1.3
## H2O cluster version age: 1 month and 25 days
## H2O cluster name: H2O_started_from_R_rstudio-user_ttp474
## H2O cluster total nodes: 1
## H2O cluster total memory: 5.91 GB
## H2O cluster total cores: 4
## H2O cluster allowed cores: 4
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## H2O API Extensions: Amazon S3, XGBoost, Algos, AutoML, Core V3, TargetEncoder, Core V4
## R Version: R version 4.0.3 (2020-10-10)
h2o.removeAll()
h2o.no_progress()
datos_h2o <- as.h2o(x = turbines, destination_frame = "datos_h2o")
## Warning in use.package("data.table"): data.table cannot be used without R
## package bit64 version 0.9.7 or higher. Please upgrade to take advangage of
## data.table speedups.
# 60/20/20 split: ratios give the first two partitions, the remainder
# (~20%) becomes the test frame.
particiones <- h2o.splitFrame(data = datos_h2o, ratios = c(0.6,0.2), seed = 1234)
datos_train_h2o <- h2o.assign(data = particiones[[1]], key = "datos_train_H2O")
datos_valid_h2o <- h2o.assign(data = particiones[[2]], key = "datos_valid_H2O")
datos_test_h2o <- h2o.assign(data = particiones[[3]], key = "datos_test_H2O")
#Comprobamos las distribuciones de los subconjuntos de entrenamiento, validación y pruebas
summary(datos_train_h2o$capacity)
## Warning in summary.H2OFrame(datos_train_h2o$capacity): Approximated
## quantiles computed! If you are interested in exact quantiles, please pass the
## `exact_quantiles=TRUE` parameter.
## capacity
## Min. : 65
## 1st Qu.:1598
## Median :1879
## Mean :1964
## 3rd Qu.:2299
## Max. :3750
summary(datos_valid_h2o$capacity)
## Warning in summary.H2OFrame(datos_valid_h2o$capacity): Approximated
## quantiles computed! If you are interested in exact quantiles, please pass the
## `exact_quantiles=TRUE` parameter.
## capacity
## Min. : 65
## 1st Qu.:1598
## Median :1901
## Mean :1986
## 3rd Qu.:2299
## Max. :3750
summary(datos_test_h2o$capacity)
## Warning in summary.H2OFrame(datos_test_h2o$capacity): Approximated quantiles
## computed! If you are interested in exact quantiles, please pass the
## `exact_quantiles=TRUE` parameter.
## capacity
## Min. : 65
## 1st Qu.:1599
## Median :1799
## Mean :1959
## 3rd Qu.:2300
## Max. :3450
#Optimizacion de hiper parametros
# Hyper-parameter search: fit one random forest per ntrees value in 1..50.
# y = 1 is the column index of "capacity" in datos_train_h2o; x defaults to
# all remaining columns. NOTE(review): no validation_frame is supplied, so
# the grid is ranked on the training (OOB) RMSE — consider passing
# datos_valid_h2o so ranking uses held-out error.
h2o_hiperparametros_optimizados <- h2o.grid("randomForest",
y = 1,
training_frame =datos_train_h2o,
hyper_params = list(ntrees = c(1:50)),
grid_id = "h2o_rf_grid1")
# Retrieve the grid sorted by RMSE ascending and keep the best model.
gridperf1 <- h2o.getGrid(grid_id = "h2o_rf_grid1",sort_by = "rmse",decreasing = FALSE)
best_grid <- h2o.getModel(gridperf1@model_ids[[1]])
best_grid
## Model Details:
## ==============
##
## H2ORegressionModel: drf
## Model ID: h2o_rf_grid1_model_1
## Model Summary:
## number_of_trees number_of_internal_trees model_size_in_bytes min_depth
## 1 1 1 976 15
## max_depth mean_depth min_leaves max_leaves mean_leaves
## 1 15 15.00000 73 73 73.00000
##
##
## H2ORegressionMetrics: drf
## ** Reported on training data. **
## ** Metrics reported on Out-Of-Bag training samples **
##
## MSE: 17303
## RMSE: 131.5409
## MAE: 57.09816
## RMSLE: 0.06175073
## Mean Residual Deviance : 17303
Por lo tanto, usamos esos hiperparametros para definir nuestro modelo:
#Final Model
# Random forest refit with the ntrees value found by the grid search.
# x = 2:6 / y = 1 are column indices of datos_train_h2o (capacity is col 1).
# NOTE(review): after select() the frame has only 5 columns, so index 6 in
# x = 2:6 looks off — confirm H2O tolerates it, or use x = 2:5.
h2o_random_forest_model <- h2o.randomForest(
training_frame = datos_train_h2o,
validation_frame = datos_valid_h2o,
x = 2:6,
y = 1,
model_id = "Grid_datos_train_H2O_model",
ntrees = best_grid@parameters$ntrees, # number of trees tuned with h2o.grid()
stopping_rounds = 2,
score_each_iteration = T,
seed = 1234
)
# NOTE(review): with train/valid/xval all FALSE, h2o.rmse() presumably falls
# back to the training metric, yet it is later compared against the tree
# model's TEST RMSE — consider valid = TRUE for an apples-to-apples comparison.
RMSE_H2O_Random_Forest <- h2o.rmse(h2o_random_forest_model, train = FALSE, valid = FALSE, xval = FALSE)
#Predictions with H2O randomForest
# Score the held-out test frame and compute residuals
# (diff_capacity = prediction - actual; negative means underestimation).
predictions_h2o <- as.data.frame(h2o.predict(h2o_random_forest_model,datos_test_h2o))
h2o_test_w_pred <- as.data.frame(datos_test_h2o)
diff_turbineCapacity <- as.data.frame(predictions_h2o - h2o_test_w_pred$capacity) %>% rename(diff_capacity = predict)
h2o_test_w_pred <- cbind(h2o_test_w_pred, predictions_h2o, diff_turbineCapacity)
# Residuals vs. actual capacity, colored by manufacturer.
h2o_test_w_pred %>% ggplot(aes(x = capacity, y = diff_capacity, color=manufacturer))+
geom_point()
Podemos ver que los puntos están cercanos a 0 en el rango de 0 a 2000 kW, pero que conforme aumenta la capacidad de la turbina tenemos errores mucho más altos, sobre todo hacia la parte negativa, por lo que deducimos que el modelo está subestimando la capacidad de las turbinas basado en las dimensiones del rotor y del hub, conforme se acercan a los 3000 kW.
h2o_test_w_pred %>% ggplot(aes(x = capacity, y = diff_capacity, color=manufacturer))+
geom_point() +
labs(title = "Comparación de Capacidad real vs Predicciones H2O") +
facet_wrap(facets = vars(manufacturer), nrow = 2, scales = "free") +
theme_bw() +
theme(legend.position = "none")
En cuanto a proveedor, vemos que el modelo predice mejor las turbinas del proveedor Vestas (antepenúltimo panel, en color rosa), teniendo la mayoría de los errores de las predicciones valores pequeños (cercanos a 0).
*********************EJERCICIO 2********************
***********************MODELOS**********************
******************H20 Decision Tree*****************
En este punto me doy cuenta de que lo que nos está solicitando en el ejercicio es implementar un modelo de Decision Tree en H2O (para lo cual nos puso la liga a un ejemplo), por lo que me dispongo a hacerlo:
# Fresh H2O frame for the decision-tree exercise.
datos_h2o_decisionT <- as.h2o(x = turbines, destination_frame = "datos_h2o_decisionT")
## Warning in use.package("data.table"): data.table cannot be used without R
## package bit64 version 0.9.7 or higher. Please upgrade to take advangage of
## data.table speedups.
# Target and predictors: only the numeric turbine dimensions remain as
# predictors; manufacturer and commissioning_date are explicitly excluded
# ("model" was already dropped at the select() step, so listing it here is
# harmless — setdiff simply ignores it).
response = "capacity"
predictors = setdiff(colnames(datos_h2o_decisionT),
c(response,"manufacturer","model","commissioning_date")
)
# New 60/20/20 split with a different seed than the random-forest run.
particiones2 <- h2o.splitFrame(data = datos_h2o_decisionT, ratios = c(0.6,0.2), seed = 12345)
datos_train_h2o_dt <- h2o.assign(data = particiones2[[1]], key = "datos_train_h2o_dt")
datos_valid_h2o_dt <- h2o.assign(data = particiones2[[2]], key = "datos_valid_h2o_dt")
datos_test_h2o_dt <- h2o.assign(data = particiones2[[3]], key = "datos_test_h2o_dt")
#Comprobamos las distribuciones de los subconjuntos de entrenamiento, validación y pruebas
summary(datos_train_h2o_dt$capacity)
## Warning in summary.H2OFrame(datos_train_h2o_dt$capacity): Approximated
## quantiles computed! If you are interested in exact quantiles, please pass the
## `exact_quantiles=TRUE` parameter.
## capacity
## Min. : 65
## 1st Qu.:1598
## Median :1797
## Mean :1967
## 3rd Qu.:2299
## Max. :3750
summary(datos_valid_h2o_dt$capacity)
## Warning in summary.H2OFrame(datos_valid_h2o_dt$capacity): Approximated
## quantiles computed! If you are interested in exact quantiles, please pass the
## `exact_quantiles=TRUE` parameter.
## capacity
## Min. : 65
## 1st Qu.:1599
## Median :1900
## Mean :1962
## 3rd Qu.:2300
## Max. :3450
summary(datos_test_h2o_dt$capacity)
## Warning in summary.H2OFrame(datos_test_h2o_dt$capacity): Approximated
## quantiles computed! If you are interested in exact quantiles, please pass the
## `exact_quantiles=TRUE` parameter.
## capacity
## Min. : 65
## 1st Qu.:1598
## Median :1901
## Mean :1973
## 3rd Qu.:2299
## Max. :3750
# A single GBM tree (ntrees = 1, sample/col rates = 1, min_rows = 1) acts as
# a plain decision tree; tune its max_depth over 2..15.
gbm_params = list(max_depth = seq(2, 15))
gbm_grid = h2o.grid("gbm", x = predictors, y = response,
grid_id = "gbm_grid_2tree",
training_frame = datos_train_h2o_dt,
validation_frame = datos_valid_h2o_dt,
ntrees = 1, min_rows = 1,
sample_rate = 1, col_sample_rate = 1,
learn_rate = .01, seed = 12345,
hyper_params = gbm_params)
# NOTE(review): decreasing = TRUE sorts RMSE worst-first. Harmless here
# because only the summary table is plotted, but gbm_gridperf@model_ids[[1]]
# would be the WORST model, not the best.
gbm_gridperf = h2o.getGrid(grid_id = "gbm_grid_2tree",
sort_by = "rmse",
decreasing = TRUE)
# RMSE vs. max_depth. sapply(..., as.numeric) coerces every summary-table
# column; the character model_ids column produces the NA warning below.
ggplot(as.data.frame(sapply(gbm_gridperf@summary_table, as.numeric))) +
geom_point(aes(max_depth, rmse)) +
geom_line(aes(max_depth, rmse, group=1)) +
labs(x="max depth", y="RMSE", title="Grid Search for Single Tree Models") +
theme_bw() +
theme(legend.position = "none")
## Warning in base::as.numeric(x): NAs introduced by coercion
Podemos observar que el max depth óptimo es 8, ya que es donde se estabiliza el RMSE. Ahora finalicemos el modelo:
#Final Model
# Single-tree GBM refit at the depth chosen from the grid search.
h2o_decision_tree_final =
h2o.gbm(x = predictors, y = response,
training_frame = datos_train_h2o_dt,
ntrees = 1, min_rows = 1,
sample_rate = 1, col_sample_rate = 1,
max_depth = 8,
# use early stopping once the validation RMSE doesn't improve
# by at least 1% (stopping_tolerance = 0.01) for 3 consecutive scoring events
stopping_rounds = 3, stopping_tolerance = 0.01,
stopping_metric = "RMSE",
seed = 12345)
## Warning in .h2o.processResponseWarnings(res): early stopping is enabled but neither score_tree_interval or score_each_iteration are defined. Early stopping will not be reproducible!.
# Extract the fitted tree's structure so it can be plotted with data.tree.
TurbinesH2oTree = h2o.getModelTree(model = h2o_decision_tree_final, tree_number = 1)
################funciones para poder graficar el arbol de H2O
# Convert an H2OTree into a data.tree structure suitable for plotting.
# The root of a fitted tree is always a split node; addChildren() then
# recursively attaches the rest of the tree.
createDataTree <- function(h2oTree) {
  root_node <- h2oTree@root_node
  tree <- Node$new(root_node@split_feature)
  tree$type <- 'split'
  addChildren(tree, root_node)
  tree
}
# Recursively attach an H2O node's children to the data.tree `dtree`.
# Returns TRUE when `node` is a leaf (stops the recursion), FALSE otherwise;
# callers ignore the return value.
addChildren <- function(dtree, node) {
# Leaves contribute no further children.
if(class(node)[1] != 'H2OSplitNode') return(TRUE)
feature = node@split_feature
id = node@id
na_direction = node@na_direction
# An NA threshold indicates a categorical split: edges are labeled with the
# level lists (via printValues). A numeric threshold labels the edges
# "< t" / ">= t", appending ",NA" on the side that receives missing values.
if(is.na(node@threshold)) {
leftEdgeLabel = printValues(node@left_levels,
na_direction=='LEFT', 4)
rightEdgeLabel = printValues(node@right_levels,
na_direction=='RIGHT', 4)
}else {
leftEdgeLabel = paste("<", node@threshold,
ifelse(na_direction=='LEFT',',NA',''))
rightEdgeLabel = paste(">=", node@threshold,
ifelse(na_direction=='RIGHT',',NA',''))
}
left_node = node@left_child
right_node = node@right_child
# Child labels: a leaf shows its prediction, a split shows its feature.
if(class(left_node)[[1]] == 'H2OLeafNode')
leftLabel = paste("prediction:", left_node@prediction)
else
leftLabel = left_node@split_feature
if(class(right_node)[[1]] == 'H2OLeafNode')
rightLabel = paste("prediction:", right_node@prediction)
else
rightLabel = right_node@split_feature
# Sibling labels must differ so data.tree keeps them as distinct children.
if(leftLabel == rightLabel) {
leftLabel = paste(leftLabel, "(L)")
rightLabel = paste(rightLabel, "(R)")
}
dtreeLeft = dtree$AddChild(leftLabel)
dtreeLeft$edgeLabel = leftEdgeLabel
dtreeLeft$type = ifelse(class(left_node)[1] == 'H2OSplitNode', 'split', 'leaf')
dtreeRight = dtree$AddChild(rightLabel)
dtreeRight$edgeLabel = rightEdgeLabel
dtreeRight$type = ifelse(class(right_node)[1] == 'H2OSplitNode', 'split', 'leaf')
# Recurse into both subtrees.
addChildren(dtreeLeft, left_node)
addChildren(dtreeRight, right_node)
return(FALSE)
}
# Format a vector of categorical split levels for a tree-edge label.
#
# Shows at most `n` levels, appends ",..." when levels were truncated, and
# appends "NA" when missing values follow this edge (`is_na_direction`).
# Returns a single character string (empty when there are no levels and NAs
# route to the other side).
#
# Fixes vs. original: `<-` assignment instead of `=`, and plain `if`/`else`
# instead of scalar `ifelse()` (which is slower and strips attributes).
printValues <- function(values, is_na_direction, n = 4) {
  n_levels <- length(values)
  if (n_levels == 0) {
    # No explicit levels on this edge: label it "NA" only if NAs route here.
    return(if (is_na_direction) "NA" else "")
  }
  shown <- paste0(values[1:min(n, n_levels)], collapse = ', ')
  paste0(shown,
         if (n_levels > n) ",..." else "",
         if (is_na_direction) ", NA" else "")
}
################funciones para poder graficar el arbol de H2O
TurbinesDataTree = createDataTree(TurbinesH2oTree)
# Styling accessors used by data.tree: edge label, node shape and font are
# chosen by whether a node is a split (diamond, bold) or a leaf (oval, plain).
GetEdgeLabel <- function(node) {
  node$edgeLabel
}
GetNodeShape <- function(node) {
  switch(node$type, split = "diamond", leaf = "oval")
}
GetFontName <- function(node) {
  switch(node$type, split = 'Palatino-bold', leaf = 'Palatino')
}
# Apply edge / node / graph styles for the data.tree plot:
# diamonds for splits, ovals for leaves, left-to-right layout.
SetEdgeStyle(TurbinesDataTree, fontname = 'Palatino-italic',
label = GetEdgeLabel, labelfloat = TRUE,
fontsize = "26", fontcolor='royalblue4')
SetNodeStyle(TurbinesDataTree, fontname = GetFontName, shape = GetNodeShape,
fontsize = "26", fontcolor='royalblue4',
height="0.75", width="1")
SetGraphStyle(TurbinesDataTree, rankdir = "LR", dpi=70.)
Obtenemos el esquema del árbol de decisión, obtenemos las predicciones y calculamos el RMSE del modelo manualmente:
plot(TurbinesDataTree, output = "graph")
#Predictions with H2O Decision Tree
# Score the single-tree model on its own test split and compute residuals
# (diff_capacity = prediction - actual).
predictions_h2o_DT <- as.data.frame(h2o.predict(h2o_decision_tree_final,datos_test_h2o_dt))
h2o_test_w_pred_DT <- as.data.frame(datos_test_h2o_dt)
diff_turbineCapacity_DT <- as.data.frame(predictions_h2o_DT - h2o_test_w_pred_DT$capacity) %>% rename(diff_capacity = predict)
h2o_test_w_pred_DT <- cbind(h2o_test_w_pred_DT, predictions_h2o_DT, diff_turbineCapacity_DT)
summary(h2o_test_w_pred_DT)
## capacity rotor_diameter_m hub_height_m manufacturer
## Min. : 65 Min. : 15.50 Min. : 24.50 GE :342
## 1st Qu.:1600 1st Qu.: 78.50 1st Qu.: 80.00 Vestas :341
## Median :1903 Median : 90.00 Median : 80.00 Siemens :225
## Mean :1973 Mean : 88.58 Mean : 83.03 Enercon :189
## 3rd Qu.:2300 3rd Qu.:100.00 3rd Qu.: 90.00 Senvion :116
## Max. :3750 Max. :141.00 Max. :132.00 NEG Micon: 32
## (Other) : 26
## commissioning_date predict diff_capacity
## Min. :1998 Min. :1777 Min. :-1604.731
## 1st Qu.:2009 1st Qu.:1937 1st Qu.: -300.509
## Median :2011 Median :1972 Median : 71.027
## Mean :2011 Mean :1967 Mean : -6.307
## 3rd Qu.:2014 3rd Qu.:1999 3rd Qu.: 337.144
## Max. :2019 Max. :2145 Max. : 1711.769
##
# Root-mean-squared error between two numeric vectors.
#
# m: predictions (or observations — the measure is symmetric).
# o: the other vector, same length as m.
# Returns sqrt(mean((m - o)^2)); propagates NA if any element is NA.
# Fix vs. original: `<-` assignment instead of `=` per R style convention.
RMSE <- function(m, o) {
  sqrt(mean((m - o)^2))
}
RMSE_H2O_Decision_Tree <- RMSE(h2o_test_w_pred_DT$capacity,h2o_test_w_pred_DT$predict)
RMSE_H2O_Decision_Tree
## [1] 553.6733
RMSE_H2O_Random_Forest
## [1] 235.5947
Comparando la métrica del RMSE para el modelo que creamos de Random Forest (por error) contra el modelo solicitado en el examen, creo que hace sentido que un sólo decision tree tiene más error, ya que el random forest es una versión “mejorada” al utilizar varios árboles de decisión distintos y obteniendo el promedio de sus predicciones.
*********************EJERCICIO 2********************
****************MODELADO DE DATOS*******************
*************Decision Tree tidymodels***************
# Use all but one core for the tuning step below.
registerDoParallel(cores = detectCores() - 1)
# Split into training and testing datasets (80/20, stratified on capacity by
# caTools::sample.split).
# Fix: seed the RNG BEFORE sample.split — in the original the only
# set.seed(12345) ran after the split, so the partition was not reproducible.
set.seed(12345)
split <- sample.split(turbines$capacity, SplitRatio = 0.80)
turbines_train <- subset(turbines, split == TRUE)
turbines_test <- subset(turbines, split == FALSE)
dim(turbines_train)
## [1] 5182 5
dim(turbines_test)
## [1] 1296 5
# Model definition
set.seed(12345)
# rpart regression tree; depth and minimum node size are tuned below.
dt_tidymodels <- decision_tree(
mode = "regression",
tree_depth = tune(),
min_n = tune()
) %>%
set_engine(engine = "rpart")
# Preprocessing recipe: drop rows with NA predictors, remove near-zero-
# variance columns, center/scale numeric predictors, one-hot encode factors.
transformer <- recipe(
formula = capacity ~ .,
data = turbines_train
) %>%
step_naomit(all_predictors()) %>%
step_nzv(all_predictors()) %>%
step_center(all_numeric(), -all_outcomes()) %>%
step_scale(all_numeric(), -all_outcomes()) %>%
step_dummy(all_nominal(), -all_outcomes())
# Estimate recipe parameters on the training data (reused to bake test data).
transformer_fit <- prep(transformer)
# 5-fold cross-validation, stratified on the outcome.
cv_folds <- vfold_cv(
data = turbines_train,
v = 5,
strata = capacity
)
# Workflow = preprocessing recipe + tunable tree specification.
workflow_modelado <- workflow() %>%
add_recipe(transformer) %>%
add_model(dt_tidymodels)
# Max-entropy design over the hyper-parameter space.
hiperpar_grid <- grid_max_entropy(
# Search range for each hyper-parameter
tree_depth(range = c(1, 10), trans = NULL),
min_n(range = c(2, 100), trans = NULL),
# Total number of candidate combinations
size = 50
)
# Hyper-parameter optimization (runs in parallel via doParallel).
grid_fit <- tune_grid(
object = workflow_modelado,
resamples = cv_folds,
metrics = metric_set(rmse),
control = control_resamples(save_pred = TRUE),
# Hyper-parameter candidates
grid = hiperpar_grid
)
stopImplicitCluster()
# Pick the hyper-parameter combination with the lowest cross-validated RMSE.
best_hip <- select_best(grid_fit, metric = "rmse")
best_hip
## # A tibble: 1 x 3
## tree_depth min_n .config
## <int> <int> <chr>
## 1 7 97 Preprocessor1_Model20
# Finalize the model specification with the tuned hyper-parameters.
final_model_decisionTree_tidymodels <- finalize_model(x = dt_tidymodels, parameters = best_hip)
final_model_decisionTree_tidymodels
## Decision Tree Model Specification (regression)
##
## Main Arguments:
## tree_depth = 7
## min_n = 97
##
## Computational engine: rpart
# Finalize the workflow with the tuned hyper-parameters, fit on the full
# training set, and extract the bare parsnip/rpart fit.
# NOTE(review): pull_workflow_fit() is deprecated in newer tidymodels;
# extract_fit_parsnip() is the drop-in replacement.
final_model_decisionTree_tidymodels_fit <- finalize_workflow(
x = workflow_modelado,
parameters = best_hip
) %>%
fit(data = turbines_train) %>%
pull_workflow_fit()
# Apply the recipe (prepped on training data) to the raw test set — the bare
# parsnip fit expects preprocessed (centered/scaled/dummied) columns.
data_test_prep <- bake(transformer_fit, turbines_test)
# Predictions on the testing dataset
predicciones <- final_model_decisionTree_tidymodels_fit %>%
predict(
new_data = data_test_prep,
type = "numeric"
)
# Attach the observed capacity so the RMSE metric can be computed below.
predicciones <- predicciones %>%
bind_cols(data_test_prep %>% select(capacity))
Como podemos observar, nuestras predicciones no son tan buenas como un random forest, pero el decision tree de tidymodels obtiene definitivamente mejores predicciones que el decision tree de H2O.
Me gustaría destacar que tuve que quitar la variable de model del dataset pues al convertirla a variable dummy, el comando de recipe me estaba dando un error sobre columnas duplicadas las cuales eran nombres de modelos. El mensaje de error me sugería que hiciera .name_repair pero no supe cómo, así que preferí quitarla y con eso se eliminó el error.
rmse(predicciones, truth = capacity, estimate = .pred, na_rm = TRUE)
## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 198.
*********************EJERCICIO 2********************
****************MODELADO DE DATOS*******************
*************RPart con poda del árbol***************